Introduction

This R Markdown notebook is used to document the various aspects of the genotype-phenotype analysis in many subjects with hereditary hearing loss based on mutation in the DFNA9 gene. We have data collected […].

Load R-packages

library(ggplot2)
library(drc)
## Loading required package: MASS
## 
## 'drc' has been loaded.
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
## 
## Attaching package: 'drc'
## The following objects are masked from 'package:stats':
## 
##     gaussian, getInitial
library(sjPlot)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## #refugeeswelcome
library("readxl")
library("nlme")
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
## 
##     lmList
library(knitr)
library(forcats)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data and clean data-frames

Load data from Excel file and select only relevant columns/rows. The first analyses will be based on pure-tone average (PTA). The selected subset dataframe consists of the columns patient id (pid), group, age (Leeftijd), and the PTA (PTA54ADS).

data_raw <- read_excel("../data/raw_data/database_20-04-2020.xlsx")
data_raw$group = factor(data_raw$Domain)
#leave out data with only n=1 dataset per domain/certain unpublished data.
data_subset <- subset(data_raw, Smits=='no' & Domainrec!=1 & group!="Ivd1")  
data <- subset(data_subset, select=c('pid', 'group', 'Leeftijd', 'PTA54ADS'))
data <- droplevels(data)
#data_raw %>% group_by(group) %>% summarize(count=n())
#head(data)
#check for NaNs in PTA and Leeftijd, should be 0.
nrow(data[is.na(data$PTA54ADS) | is.na(data$Leeftijd), ])
## [1] 0
#number (n) of counts (i.e. audiograms) per subject (pid)
summarytable <- data %>% count(group, pid)

Group description

In the group of DFNA9 patients we have some for which there is longitudinal data, i.e. multiple audiograms over time/age (Leeftijd). First, create a table and histogram of number of measurements for each subject id (pid) across the groups

num_meas_per_id <- aggregate(PTA54ADS ~ pid , data, function(x) length(unique(x)))
t1 <- table(num_meas_per_id$PTA54ADS)

In total there are 283 subjects with 716 measurements; 159 patients with only 1 measurement and 123 patients with 2 or more measurements, see e.g. table 1 or the histogram.

kable(t1, caption = "Table 1. The number of subjects that each have n audiograms", col.names = c("# audiograms","# subjects"))
Table 1. The number of subjects that each have n audiograms
# audiograms # subjects
1 159
2 47
3 21
4 10
5 13
6 11
7 4
8 7
9 3
10 3
11 3
12 1
15 1

Now make a histogram of the number of audiograms across patients in each of the groups

ggplot(data=summarytable, aes(x=n,fill=group)) + 
  geom_histogram(binwidth=1) +
  facet_wrap(~group) +
  xlab("# of measurements") +
  ylab("Frequency (# patients)") 

  dev.print(pdf, '../results/histogram_number_meas_pid.pdf')
## quartz_off_screen 
##                 2

Relation of PTA with age for the different groups; connecting lines show longitudinal data of patients’ PTA over time

ggplot(data, aes(x = Leeftijd, y = PTA54ADS, group = pid, color = group)) + 
  #xlab("Age (years)") +
  #ylab("PTA (dB HL)") +
  geom_point(aes(colour = factor(group))) +
  geom_line(data=data, size=1, alpha = .3) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 

  dev.print(pdf, '../results/pta_age_pid_groups.pdf')
## quartz_off_screen 
##                 2

Logistic fit of PTA with age

Perform non-linear fit; first focus on LCCL domain.

lccl = subset(data, group=="LCCL")
ggplot(lccl, aes(x=Leeftijd, y=PTA54ADS), group=pid, color=group) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  geom_point(aes(colour = factor(group))) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  theme_light()

  dev.print(pdf, '../results/pta_age_pid_lccl.pdf')
## quartz_off_screen 
##                 2
lin_fit <- nls(PTA54ADS ~ a*Leeftijd + b, lccl)
## Warning in nls(PTA54ADS ~ a * Leeftijd + b, lccl): No starting values specified for some parameters.
## Initializing 'a', 'b' to '1.'.
## Consider specifying 'start' or using a selfStart model
summary(lin_fit)
## 
## Formula: PTA54ADS ~ a * Leeftijd + b
## 
## Parameters:
##    Estimate Std. Error t value Pr(>|t|)    
## a   1.60851    0.05331  30.170   <2e-16 ***
## b -28.43929    2.89132  -9.836   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.01 on 673 degrees of freedom
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 9.99e-10
nls_fit <- nls(PTA54ADS ~ a*Leeftijd^b, lccl, start = list(a = 0.05, b = 1.5))
summary(nls_fit)
## 
## Formula: PTA54ADS ~ a * Leeftijd^b
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a  0.07298    0.01859   3.926 9.51e-05 ***
## b  1.66646    0.06175  26.988  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.54 on 673 degrees of freedom
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 2.93e-06
startvec <- c(Asym = 120, xmid = 50, scal = 15)
nls_logis<- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal),
                 data=lccl,
                 start = startvec)
summary(nls_logis)
## 
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal)
## 
## Parameters:
##      Estimate Std. Error t value Pr(>|t|)    
## Asym  126.351      9.477   13.33   <2e-16 ***
## xmid   56.974      2.619   21.76   <2e-16 ***
## scal   15.410      1.371   11.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.29 on 672 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 3.603e-06

Compare power-law fit and the logistic functione and display the results

anova(nls_fit,nls_logis)
newdat = expand.grid(Leeftijd = seq(0, 100, by = 1))
newdat$lin <-predict(lin_fit, newdata = newdat)
newdat$pta_logistic <- predict(nls_logis,newdata = newdat)
newdat$pta_power <- predict(nls_fit,newdata = newdat)
#newdat
ggplot(lccl, aes(x=Leeftijd, y=PTA54ADS)) +
  geom_point(aes(colour = factor(group))) +
  geom_line(data = newdat, aes(y = lin), size = 1, col='blue') +
  geom_line(data = newdat, aes(y = pta_logistic), size = 1) +
  geom_line(data = newdat, aes(y = pta_power), size = 1, col='red') +
 
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
## Warning: Removed 14 row(s) containing missing values (geom_path).
## Warning: Removed 11 row(s) containing missing values (geom_path).

  dev.print(pdf, '../results/pta_age_lccl_fits.pdf')
## quartz_off_screen 
##                 2

As we can see, the logistic function (SSlogis) describes the data better than the power-law function (F = 18,9; p = 1.6 e-5) This function has also been used in desribing the (frequency-specific) thresholds in Pauw et al., 2011 and will used in the subsequent sections.

Group comparison

The main questions is whether the function that describes the PTA (dB HL) as a function of age (years) differs between the groups @ref(fig:plot_pta_age_groups).

Start with a group-fit; discarding grouping information

fit0 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal), data=data)
summary(fit0)
## 
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal)
## 
## Parameters:
##      Estimate Std. Error t value Pr(>|t|)    
## Asym  135.557     12.188   11.12   <2e-16 ***
## xmid   59.258      3.517   16.85   <2e-16 ***
## scal   17.807      1.639   10.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.78 on 713 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 2.948e-06
coef(fit0)
##      Asym      xmid      scal 
## 135.55740  59.25776  17.80669

Now, add a grouping-variable with the mid-point (xmid)

# https://stats.stackexchange.com/questions/27273/how-do-i-fit-a-nonlinear-mixed-effects-model-for-repeated-measures-data-using-nl
# https://stats.stackexchange.com/questions/316801/how-to-compare-logistic-regression-curves
fit1 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal), 
            data=data,
            start=list(Asym=rep(120,1), xmid=rep(50,3), scal=rep(15,1)))
summary(fit1)
## 
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal)
## 
## Parameters:
##       Estimate Std. Error t value Pr(>|t|)    
## Asym   116.114      6.242   18.60   <2e-16 ***
## xmid1   54.188      1.813   29.89   <2e-16 ***
## xmid2   43.644      3.462   12.61   <2e-16 ***
## xmid3   34.631      3.185   10.87   <2e-16 ***
## scal    14.394      1.127   12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.13 on 711 degrees of freedom
## 
## Number of iterations to convergence: 7 
## Achieved convergence tolerance: 4.654e-06

And add the scaling [scal] as a grouping variable; does it futher explain differences between groups?

fit2 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal[group]), 
            data=data,
            start=list(Asym=rep(120,1), xmid=rep(50,3), scal=rep(15,3)))
summary(fit2)
## 
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal[group])
## 
## Parameters:
##       Estimate Std. Error t value Pr(>|t|)    
## Asym   123.891      8.563  14.468  < 2e-16 ***
## xmid1   56.304      2.381  23.652  < 2e-16 ***
## xmid2   49.833      6.541   7.618 8.24e-14 ***
## xmid3   37.568      5.595   6.715 3.86e-11 ***
## scal1   15.084      1.291  11.682  < 2e-16 ***
## scal2   24.884      8.419   2.956  0.00322 ** 
## scal3   26.860      6.824   3.936 9.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.05 on 709 degrees of freedom
## 
## Number of iterations to convergence: 6 
## Achieved convergence tolerance: 2.195e-06
plot(profile(fit2))

And now add the asymptotic value of the fit (Leeftijd -> infinity) (Asym):

fit3 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym[group], xmid[group], scal[group]), 
            data=data,
            start=list(Asym=rep(120,3), xmid=rep(50,3), scal=rep(15,3)))
summary(fit3)
## 
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym[group], xmid[group], scal[group])
## 
## Parameters:
##       Estimate Std. Error t value Pr(>|t|)    
## Asym1  126.350      9.361  13.497  < 2e-16 ***
## Asym2   89.933     60.449   1.488   0.1373    
## Asym3   97.280     10.751   9.049  < 2e-16 ***
## xmid1   56.974      2.587  22.025  < 2e-16 ***
## xmid2   34.989     26.673   1.312   0.1900    
## xmid3   27.148      4.666   5.819 8.99e-09 ***
## scal1   15.410      1.354  11.383  < 2e-16 ***
## scal2   17.424     17.929   0.972   0.3315    
## scal3   13.988      5.807   2.409   0.0163 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.05 on 707 degrees of freedom
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 3.258e-06

Now test the various models. Which of the parameters explain the data best?

anova(fit0,fit1,fit2,fit3)

It turns out the both the variables [xmid] and [scale], i.e. the midpoint and slope at the midpoint significantly differ between the three groups, but that adding the asymptotic value does not describe the data signigicantly better (F=0.89, p=0.41).

newdat = expand.grid(Leeftijd = seq(0, 100, by = 1), group = c("LCCL","vWFA1","vWFA2"))
newdat$fit <- predict(fit2, newdata = newdat)
ggplot(data, aes(x=Leeftijd, y=PTA54ADS, group = pid, color = group)) +
  geom_point(aes(colour = factor(group)),alpha = .4) +
  geom_line(data=data, size=1, alpha = .2) +
  geom_line(data = newdat, aes(y = fit, group = group, colour = factor(group)), size = 1) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 

  dev.print(pdf, '../results/pta_age_pid_groups_fits.pdf')
## quartz_off_screen 
##                 2

Now perform fit on individual data by subsetting the data to keep individuals with more that 4 longitudinal datapoints. Is it the case that using a non-linear mixed-model approach may help us?

#https://stackoverflow.com/questions/14439770/filter-rows-in-dataframe-by-number-of-rows-per-level-of-a-factor
pidlengths <- ave(as.numeric(data$pid), 
                     data$pid, FUN = length)
#df2 <- lccl[pidlengths > 5, ]
df2 <- data[pidlengths > 2, ]
with(df2, table(group))
## group
##  LCCL vWFA1 vWFA2 
##   459     3     6
models <- nlsList(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal) | pid, data = df2)
## Warning: 37 errors caught in (attr(object, "initial"))(mCall = mCall, data = data, LHS = LHS).  The error messages and their frequencies are
## 
##                                     singular matrix 'a' in solve 
##                                                                1 
##                                                singular gradient 
##                                                                2 
## step factor 0.000488281 reduced below 'minFactor' of 0.000976562 
##                                                               12 
##            too few distinct input values to fit a logistic model 
##                                                               22

As we can see, some model-predictions failed; they end up with NaNs in the model fit list (nlslist); see e.g. pid 147

data_id <- subset(df2, pid=="147")
data_id
ggplot(data=data_id,  aes(x=Leeftijd, y=PTA54ADS)) +
  geom_point() +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 

Predict for all pid’s the fit to the model and remove the pid’s that give NaNs. Check how many subjects per group we end up with.

df2$Pred <-predict(models)
df2_na <- na.omit(df2)
with(df2_na, table(group, pid))
##        pid
## group   17 18 19 23 24 25 26 27 28 29 43 45 69 99 113 116 117 146 154 160 163
##   LCCL   6  6  6  8  6  6  9  6  9  5  4  8  7  7   5   5   7  10  10   5   8
##   vWFA1  0  0  0  0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0
##   vWFA2  0  0  0  0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0
##        pid
## group   165 174 175 182 186 187 202 203 207 209 215 218 220 222 223 225 230 234
##   LCCL    4   7   8   8   5   6   5   5   5   4   8   6  11  10  11   9  14  10
##   vWFA1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   vWFA2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        pid
## group   251 255 267 268
##   LCCL    8   7   8   5
##   vWFA1   0   0   0   0
##   vWFA2   0   0   0   0

We only keep the pid’s from the LCCL group. The pid’s in the other groups are not properly fitted.

le <- unique(df2_na$pid)
#le
newdat = expand.grid(Leeftijd = seq(0, 100, by = 1), pid=le)
newdat$prednlm <-predict(models, newdata=newdat)
#https://stackoverflow.com/questions/37122994/plotting-a-list-of-non-linear-regressions-with-ggplot
#https://aosmith.rbind.io/2018/11/16/plot-fitted-lines/
ip <- ggplot(data=df2_na,  aes(x=Leeftijd, y=PTA54ADS, colour = pid)) +
  geom_point() +
  geom_line(data=newdat,aes(y=prednlm)) +
  facet_wrap(~pid) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,25)) +
  scale_y_reverse(breaks=seq(0,120,40), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light()
ip

dev.print(pdf, '../results/pta_age_pid_lccl_ind_fits.pdf')
## quartz_off_screen 
##                 2

So, it seeems we can fit the data for individual subjects by some extent. It often ‘fails’ by over- or underestimating the tail (coef.lmlist Asym column). We can also plot it all in one figure.

ggplot(data=df2_na,  aes(x=Leeftijd, y=PTA54ADS, colour = pid)) +
  geom_point() +
  geom_line(data=newdat,aes(y=prednlm, group = pid)) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
## Warning: Removed 176 row(s) containing missing values (geom_path).

  dev.print(pdf, '../results/pta_age_pid_lccl_ind_fits_overlay.pdf')
## quartz_off_screen 
##                 2

Feed the remaining data into the non-linear mixed-models with the parameters Asym, xmid, and scal as random factors.

nm1 <- nlmer(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal) ~ Asym + xmid + scal | pid, df2_na, start = c(Asym = 100, xmid = 60, scal = 15), corr = FALSE)
summary(nm1)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal) ~ Asym + xmid +  
##     scal | pid
##    Data: df2_na
## 
##      AIC      BIC   logLik deviance df.resid 
##   2152.6   2189.9  -1066.3   2132.6      297 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.14923 -0.50051 -0.03471  0.44682  2.94717 
## 
## Random effects:
##  Groups   Name Variance Std.Dev. Corr       
##  pid      Asym 166.742  12.913              
##           xmid  44.562   6.675    0.23      
##           scal   4.541   2.131   -0.53  0.11
##  Residual       24.231   4.922              
## Number of obs: 307, groups:  pid, 43
## 
## Fixed effects:
##      Estimate Std. Error t value
## Asym 113.0219     3.6536   30.93
## xmid  52.5762     1.1214   46.88
## scal   8.4703     0.5342   15.86
## 
## Correlation of Fixed Effects:
##      Asym  xmid 
## xmid 0.409      
## scal 0.343 0.265
coef(nm1)
## $pid
##          Asym     xmid      scal
## 17  111.91187 65.06896  9.290262
## 18  111.91187 65.06896  9.290262
## 19  118.54557 61.92052  7.638449
## 23  105.58459 49.24825  8.413491
## 24  108.87821 59.45191  9.966515
## 25  120.90609 54.73558  7.417888
## 26  109.08970 44.38473  7.839455
## 27  104.65608 46.09939  8.880052
## 28  113.29662 39.58856  5.379525
## 29  110.04858 52.98010  9.196106
## 43  113.90947 55.69913  8.459887
## 45   97.34802 45.33117 11.001626
## 69  126.47538 55.14536  5.067873
## 99  126.47538 55.14536  5.067873
## 113 123.85066 40.51042  7.292063
## 116 108.56522 47.02774  9.350318
## 117 118.05750 48.05015  6.749991
## 146 134.07677 52.99453  5.877904
## 154 124.71754 54.34416  5.675409
## 160 109.08630 48.12987  9.147920
## 163 117.51108 56.10667  7.451837
## 165  99.26396 50.66147 10.224177
## 174 102.01522 57.42787 10.467515
## 175  97.48613 60.86962 11.260595
## 182  98.61739 53.12833 10.230050
## 186 116.11258 59.74771  8.184632
## 187 108.57918 45.24103  8.603256
## 202 118.30537 48.75103  6.916644
## 203 106.80804 56.87895  9.709881
## 207 117.38917 46.67863  7.318511
## 209 117.73015 48.63513  6.977685
## 215 107.53529 48.52569  9.239854
## 218 107.19360 48.27288  9.002671
## 220 115.49471 52.50751  8.928806
## 222 118.37714 57.36307  8.206486
## 223 106.36951 52.52182  8.571337
## 225 102.56742 57.96347 10.083785
## 230 126.08938 46.73509  4.317173
## 234 106.64376 50.20636 10.065763
## 251 120.23932 48.56520  5.913720
## 255 102.15156 53.94762 10.178729
## 267 141.71632 68.79567  5.386864
## 268 126.58949 58.22265  6.962851
## 
## attr(,"class")
## [1] "coef.mer"
require(lattice) 
## Loading required package: lattice
qqmath(ranef(nm1, condVar=TRUE))
## $pid

Frequencey-specific analyses

Now subset the data to contain individual frequencies

data_all <- subset(data_subset, select=c('pid', 'group', 'Leeftijd', '250.AD','500.AD', '1000.AD','2000.AD','4000.AD','8000.AD', '250.AS','500.AS', '1000.AS','2000.AS','4000.AS','8000.AS'))
head(data_all)

Convert ‘wide’ dataset into ‘long’ format using tidyr and remove NaNs

tidier <- data_all %>%
  gather(f, dB, -pid, -group, -Leeftijd)
data_all_l <- tidier %>%
  separate(f, into = c("frequency", "ear"), sep = "\\.")
#head(data_all_l)
data_all_l$frequency = factor(data_all_l$frequency)
data_all_l$ear= factor(data_all_l$ear)
data_all_l <- na.omit(data_all_l)
#head(data_all_l)
p <- data_all_l%>%
  mutate(frequency = fct_relevel(frequency,"250", "500", "1000", "2000", "4000", "8000")) %>%
  ggplot(aes(x = frequency, y = dB)) +
    #geom_bar(stat="identity") +
    #geom_histogram() +
    geom_violin() + 
    facet_wrap(~ group, ncol = 3) +
    #geom_point()
    xlab("Frequency (Hz)") +
    ylab("Hearing level (dB)") +
    scale_y_reverse(limits=c(130,-10)) 
    #theme_classic()
p

simple linear model: PTA is a function of the affected domain; here are two levels in this mixed model; 1: timepoints for each patient; 2: genetic domain, with domain the fixed effect and patient the random effect allowing the intercept to vary across patient (~1|pid).

random_intercept <-lme(dB ~ frequency , 
            random = ~1|pid,   #p. 896
            method = "ML", 
            na.action = na.exclude, 
            control = list(opt="optim"),
            correlation = corAR1(),  #see p.897; timepoints are not equally spaced;use corCAR1 
            data = data_all_l)
summary(random_intercept)
## Linear mixed-effects model fit by maximum likelihood
##  Data: data_all_l 
##     AIC      BIC logLik
##   70816 70879.26 -35399
## 
## Random effects:
##  Formula: ~1 | pid
##         (Intercept) Residual
## StdDev:    30.66465 17.63885
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | pid 
##  Parameter estimate(s):
##       Phi 
## 0.4384654 
## Fixed effects: dB ~ frequency 
##                  Value Std.Error   DF  t-value p-value
## (Intercept)   43.50693 1.9538723 8068 22.26703  0.0000
## frequency2000  6.76735 0.6766851 8068 10.00074  0.0000
## frequency250  -2.45973 0.7709764 8068 -3.19041  0.0014
## frequency4000 22.58725 0.7647918 8068 29.53386  0.0000
## frequency500  -1.96178 0.6766216 8068 -2.89937  0.0037
## frequency8000 30.51271 0.7951280 8068 38.37459  0.0000
##  Correlation: 
##               (Intr) fr2000 frq250 fr4000 frq500
## frequency2000 -0.173                            
## frequency250  -0.197  0.410                     
## frequency4000 -0.196  0.565  0.479              
## frequency500  -0.174  0.362  0.561  0.414       
## frequency8000 -0.201  0.454  0.540  0.612  0.436
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -4.2543260 -0.5913403  0.0373855  0.6194437  3.9226279 
## 
## Number of Observations: 8347
## Number of Groups: 274
anova(random_intercept)

now add Leeftijd as fixed effect; PTA ~ Domain + Leeftijd` (see. e.g. p.897)

timeRI <- update(random_intercept, .~. + Leeftijd)
summary(timeRI)
## Linear mixed-effects model fit by maximum likelihood
##  Data: data_all_l 
##        AIC      BIC    logLik
##   67842.65 67912.94 -33911.32
## 
## Random effects:
##  Formula: ~1 | pid
##         (Intercept) Residual
## StdDev:   0.1104064 24.88384
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | pid 
##  Parameter estimate(s):
##       Phi 
## 0.8322339 
## Fixed effects: dB ~ frequency + Leeftijd 
##                   Value Std.Error   DF   t-value p-value
## (Intercept)   -50.31584 1.6018876 8067 -31.41034  0.0000
## frequency2000   5.13859 0.5814116 8067   8.83812  0.0000
## frequency250   -1.30225 0.7394264 8067  -1.76116  0.0782
## frequency4000  20.24899 0.7377572 8067  27.44668  0.0000
## frequency500   -0.94485 0.5808559 8067  -1.62665  0.1038
## frequency8000  29.31740 0.8033935 8067  36.49196  0.0000
## Leeftijd        1.93967 0.0258142 8067  75.13958  0.0000
##  Correlation: 
##               (Intr) fr2000 frq250 fr4000 frq500 fr8000
## frequency2000 -0.196                                   
## frequency250  -0.228  0.283                            
## frequency4000 -0.262  0.635  0.419                     
## frequency500  -0.176  0.195  0.631  0.284              
## frequency8000 -0.304  0.464  0.551  0.713  0.369       
## Leeftijd      -0.819  0.018 -0.023  0.032 -0.015  0.049
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.07541243 -0.70824164 -0.07201688  0.58548467  5.01653320 
## 
## Number of Observations: 8347
## Number of Groups: 274
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lattice_0.20-40 dplyr_0.8.4     tidyr_1.0.2     forcats_0.5.0  
##  [5] knitr_1.28      lme4_1.1-21     Matrix_1.2-18   nlme_3.1-145   
##  [9] readxl_1.3.1    sjPlot_2.8.2    drc_3.0-1       MASS_7.3-51.5  
## [13] ggplot2_3.3.0  
## 
## loaded via a namespace (and not attached):
##  [1] jsonlite_1.6.1    splines_3.6.0     carData_3.0-3     modelr_0.1.6     
##  [5] gtools_3.8.1      assertthat_0.2.1  highr_0.8         cellranger_1.1.0 
##  [9] yaml_2.2.1        bayestestR_0.5.2  pillar_1.4.3      backports_1.1.5  
## [13] glue_1.4.0        digest_0.6.25     minqa_1.2.4       colorspace_1.4-1 
## [17] sandwich_2.5-1    htmltools_0.4.0   pkgconfig_2.0.3   broom_0.5.5      
## [21] haven_2.2.0       purrr_0.3.3       xtable_1.8-4      mvtnorm_1.1-0    
## [25] scales_1.1.0      openxlsx_4.1.4    rio_0.5.16        emmeans_1.4.5    
## [29] tibble_2.1.3      generics_0.0.2    farver_2.0.3      car_3.0-6        
## [33] ellipsis_0.3.0    sjlabelled_1.1.3  TH.data_1.0-10    withr_2.1.2      
## [37] survival_3.1-8    magrittr_1.5      crayon_1.3.4      effectsize_0.2.0 
## [41] estimability_1.3  evaluate_0.14     foreign_0.8-76    tools_3.6.0      
## [45] data.table_1.12.8 hms_0.5.3         lifecycle_0.2.0   multcomp_1.4-12  
## [49] stringr_1.4.0     munsell_0.5.0     plotrix_3.7-7     zip_2.0.4        
## [53] ggeffects_0.14.1  compiler_3.6.0    rlang_0.4.5       grid_3.6.0       
## [57] nloptr_1.2.1      parameters_0.5.0  labeling_0.3      rmarkdown_2.1    
## [61] boot_1.3-24       gtable_0.3.0      codetools_0.2-16  abind_1.4-5      
## [65] sjstats_0.17.9    curl_4.3          sjmisc_2.8.3      R6_2.4.1         
## [69] zoo_1.8-7         performance_0.4.4 insight_0.8.1     stringi_1.4.6    
## [73] Rcpp_1.0.4.6      vctrs_0.2.4       tidyselect_1.0.0  xfun_0.12

Code Appendix

library(ggplot2)
library(drc)
library(sjPlot)
library("readxl")
library("nlme")
library(lme4)
library(knitr)
library(forcats)
library(tidyr)
library(dplyr)
data_raw <- read_excel("../data/raw_data/database_20-04-2020.xlsx")
data_raw$group = factor(data_raw$Domain)
#leave out data with only n=1 dataset per domain/certain unpublished data.
data_subset <- subset(data_raw, Smits=='no' & Domainrec!=1 & group!="Ivd1")  
data <- subset(data_subset, select=c('pid', 'group', 'Leeftijd', 'PTA54ADS'))
data <- droplevels(data)
#data_raw %>% group_by(group) %>% summarize(count=n())
#head(data)
#check for NaNs in PTA and Leeftijd, should be 0.
nrow(data[is.na(data$PTA54ADS) | is.na(data$Leeftijd), ])
#number (n) of counts (i.e. audiograms) per subject (pid)
summarytable <- data %>% count(group, pid)
num_meas_per_id <- aggregate(PTA54ADS ~ pid , data, function(x) length(unique(x)))
t1 <- table(num_meas_per_id$PTA54ADS)
kable(t1, caption = "Table 1. The number of subjects that each have n audiograms", col.names = c("# audiograms","# subjects"))
ggplot(data=summarytable, aes(x=n,fill=group)) + 
  geom_histogram(binwidth=1) +
  facet_wrap(~group) +
  xlab("# of measurements") +
  ylab("Frequency (# patients)") 
  dev.print(pdf, '../results/histogram_number_meas_pid.pdf')
ggplot(data, aes(x = Leeftijd, y = PTA54ADS, group = pid, color = group)) + 
  #xlab("Age (years)") +
  #ylab("PTA (dB HL)") +
  geom_point(aes(colour = factor(group))) +
  geom_line(data=data, size=1, alpha = .3) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
  dev.print(pdf, '../results/pta_age_pid_groups.pdf')
lccl = subset(data, group=="LCCL")
ggplot(lccl, aes(x=Leeftijd, y=PTA54ADS), group=pid, color=group) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  geom_point(aes(colour = factor(group))) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  theme_light()
  dev.print(pdf, '../results/pta_age_pid_lccl.pdf')
lin_fit <- nls(PTA54ADS ~ a*Leeftijd + b, lccl)
summary(lin_fit)
nls_fit <- nls(PTA54ADS ~ a*Leeftijd^b, lccl, start = list(a = 0.05, b = 1.5))
summary(nls_fit)
startvec <- c(Asym = 120, xmid = 50, scal = 15)
nls_logis<- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal),
                 data=lccl,
                 start = startvec)
summary(nls_logis)
anova(nls_fit,nls_logis)
newdat = expand.grid(Leeftijd = seq(0, 100, by = 1))
newdat$lin <-predict(lin_fit, newdata = newdat)
newdat$pta_logistic <- predict(nls_logis,newdata = newdat)
newdat$pta_power <- predict(nls_fit,newdata = newdat)
#newdat
ggplot(lccl, aes(x=Leeftijd, y=PTA54ADS)) +
  geom_point(aes(colour = factor(group))) +
  geom_line(data = newdat, aes(y = lin), size = 1, col='blue') +
  geom_line(data = newdat, aes(y = pta_logistic), size = 1) +
  geom_line(data = newdat, aes(y = pta_power), size = 1, col='red') +
 
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
  dev.print(pdf, '../results/pta_age_lccl_fits.pdf')

fit0 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal), data=data)
summary(fit0)
coef(fit0)
# https://stats.stackexchange.com/questions/27273/how-do-i-fit-a-nonlinear-mixed-effects-model-for-repeated-measures-data-using-nl
# https://stats.stackexchange.com/questions/316801/how-to-compare-logistic-regression-curves
fit1 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal), 
            data=data,
            start=list(Asym=rep(120,1), xmid=rep(50,3), scal=rep(15,1)))
summary(fit1)
fit2 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid[group], scal[group]), 
            data=data,
            start=list(Asym=rep(120,1), xmid=rep(50,3), scal=rep(15,3)))
summary(fit2)
plot(profile(fit2))
fit3 <- nls(PTA54ADS ~ SSlogis(Leeftijd, Asym[group], xmid[group], scal[group]), 
            data=data,
            start=list(Asym=rep(120,3), xmid=rep(50,3), scal=rep(15,3)))
summary(fit3)
anova(fit0,fit1,fit2,fit3)
newdat = expand.grid(Leeftijd = seq(0, 100, by = 1), group = c("LCCL","vWFA1","vWFA2"))
newdat$fit <- predict(fit2, newdata = newdat)

ggplot(data, aes(x=Leeftijd, y=PTA54ADS, group = pid, color = group)) +
  geom_point(aes(colour = factor(group)),alpha = .4) +
  geom_line(data=data, size=1, alpha = .2) +
  geom_line(data = newdat, aes(y = fit, group = group, colour = factor(group)), size = 1) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
  dev.print(pdf, '../results/pta_age_pid_groups_fits.pdf')
#https://stackoverflow.com/questions/14439770/filter-rows-in-dataframe-by-number-of-rows-per-level-of-a-factor
pidlengths <- ave(as.numeric(data$pid), 
                     data$pid, FUN = length)
#df2 <- lccl[pidlengths > 5, ]
df2 <- data[pidlengths > 2, ]
with(df2, table(group))
models <- nlsList(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal) | pid, data = df2)
data_id <- subset(df2, pid=="147")
data_id
ggplot(data=data_id,  aes(x=Leeftijd, y=PTA54ADS)) +
  geom_point() +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
df2$Pred <-predict(models)
df2_na <- na.omit(df2)
with(df2_na, table(group, pid))
le <- unique(df2_na$pid)
#le
newdat = expand.grid(Leeftijd = seq(0, 100, by = 1), pid=le)
newdat$prednlm <-predict(models, newdata=newdat)
#https://stackoverflow.com/questions/37122994/plotting-a-list-of-non-linear-regressions-with-ggplot
#https://aosmith.rbind.io/2018/11/16/plot-fitted-lines/
ip <- ggplot(data=df2_na,  aes(x=Leeftijd, y=PTA54ADS, colour = pid)) +
  geom_point() +
  geom_line(data=newdat,aes(y=prednlm)) +
  facet_wrap(~pid) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,25)) +
  scale_y_reverse(breaks=seq(0,120,40), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light()
ip
dev.print(pdf, '../results/pta_age_pid_lccl_ind_fits.pdf')
ggplot(data=df2_na,  aes(x=Leeftijd, y=PTA54ADS, colour = pid)) +
  geom_point() +
  geom_line(data=newdat,aes(y=prednlm, group = pid)) +
  geom_hline(yintercept=0,linetype="dashed") +
  scale_x_continuous(breaks=seq(0,100,20)) +
  scale_y_reverse(breaks=seq(-10,130,20), limits=c(130,-10)) +
  #scale_y_reverse(limits=c(130,-10)) +
  xlab("Age (years)") +
  ylab("PTA (dB HL)") +
  theme_light() 
  dev.print(pdf, '../results/pta_age_pid_lccl_ind_fits_overlay.pdf')
nm1 <- nlmer(PTA54ADS ~ SSlogis(Leeftijd, Asym, xmid, scal) ~ Asym + xmid + scal | pid, df2_na, start = c(Asym = 100, xmid = 60, scal = 15), corr = FALSE)
summary(nm1)
coef(nm1)
require(lattice) 
qqmath(ranef(nm1, condVar=TRUE))

data_all <- subset(data_subset, select=c('pid', 'group', 'Leeftijd', '250.AD','500.AD', '1000.AD','2000.AD','4000.AD','8000.AD', '250.AS','500.AS', '1000.AS','2000.AS','4000.AS','8000.AS'))
head(data_all)
tidier <- data_all %>%
  gather(f, dB, -pid, -group, -Leeftijd)
data_all_l <- tidier %>%
  separate(f, into = c("frequency", "ear"), sep = "\\.")
#head(data_all_l)
data_all_l$frequency = factor(data_all_l$frequency)
data_all_l$ear= factor(data_all_l$ear)
data_all_l <- na.omit(data_all_l)
#head(data_all_l)
p <- data_all_l%>%
  mutate(frequency = fct_relevel(frequency,"250", "500", "1000", "2000", "4000", "8000")) %>%
  ggplot(aes(x = frequency, y = dB)) +
    #geom_bar(stat="identity") +
    #geom_histogram() +
    geom_violin() + 
    facet_wrap(~ group, ncol = 3) +
    #geom_point()
    xlab("Frequency (Hz)") +
    ylab("Hearing level (dB)") +
    scale_y_reverse(limits=c(130,-10)) 
    #theme_classic()
p
random_intercept <-lme(dB ~ frequency , 
            random = ~1|pid,   #p. 896
            method = "ML", 
            na.action = na.exclude, 
            control = list(opt="optim"),
            correlation = corAR1(),  #see p.897; timepoints are not equally spaced;use corCAR1 
            data = data_all_l)
summary(random_intercept)
anova(random_intercept)
timeRI <- update(random_intercept, .~. + Leeftijd)
summary(timeRI)
sessionInfo()